{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def lzw_encode(data):\n",
    "    code, code_bits = {bytes([i]): i for i in range(256)}, 8\n",
    "    buffer, buffer_bits = 0, 0\n",
    "    index, aux = 0, []\n",
    "\n",
    "    while index < len(data):\n",
    "        # find word\n",
    "        for j in range(index + 1, len(data) + 1):\n",
    "            word = data[index:j]\n",
    "\n",
    "            # store word\n",
    "            if word not in code:\n",
    "                code[word] = len(code)\n",
    "                word = word[:-1]\n",
    "                break\n",
    "\n",
    "        # write buffer\n",
    "        buffer <<= code_bits\n",
    "        buffer |= code[word]\n",
    "        buffer_bits += code_bits\n",
    "\n",
    "        # code length\n",
    "        if len(code) > 2 ** code_bits:\n",
    "            code_bits += 1\n",
    "\n",
    "        # shift\n",
    "        index += len(word)\n",
    "\n",
    "        # buffer alignment\n",
    "        if index >= len(data) and buffer_bits % 8:\n",
    "            r = 8 - (buffer_bits % 8)\n",
    "            buffer <<= r\n",
    "            buffer_bits += r\n",
    "\n",
    "        # emit output\n",
    "        if not buffer_bits % 8:\n",
    "            aux += int.to_bytes(buffer, buffer_bits >> 3, 'big')\n",
    "            buffer, buffer_bits = 0, 0\n",
    "\n",
    "    return bytes(aux)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def lzw_decode(data):\n",
    "    code, code_bits = {i: bytes([i]) for i in range(256)}, 8\n",
    "    buffer, buffer_bits = 0, 0\n",
    "    index, aux = 0, []\n",
    "    prefix = b''\n",
    "\n",
    "    while index < len(data) or buffer_bits >= code_bits:\n",
    "        # read buffer\n",
    "        while index < len(data) and buffer_bits < code_bits:\n",
    "            buffer <<= 8\n",
    "            buffer |= data[index]\n",
    "            buffer_bits += 8\n",
    "            index += 1\n",
    "\n",
    "        # find word\n",
    "        buffer_bits -= code_bits\n",
    "        key = buffer >> buffer_bits\n",
    "        buffer &= (1 << buffer_bits) - 1\n",
    "        word = code.get(key, prefix + prefix[:1])\n",
    "\n",
    "        # store word\n",
    "        if prefix:\n",
    "            code[len(code)] = prefix + word[:1]\n",
    "        prefix = word\n",
    "\n",
    "        # code length\n",
    "        if len(code) >= 2 ** code_bits:\n",
    "            code_bits += 1\n",
    "\n",
    "        # emit output\n",
    "        aux += word\n",
    "        \n",
    "    return bytes(aux)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def compress(data):\n",
    "    encoded = lzw_encode(data.encode('ASCII'))\n",
    "    decoded = lzw_decode(encoded).decode('ASCII')\n",
    "    assert data == decoded\n",
    "    \n",
    "    print('compression', len(data), '->', len(encoded), 'bytes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "compression 11 -> 9 bytes\n"
     ]
    }
   ],
   "source": [
    "compress('ATGATCATGAG')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "compression 1000 -> 51 bytes\n"
     ]
    }
   ],
   "source": [
    "compress('x' * 1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "compression 112 -> 84 bytes\n"
     ]
    }
   ],
   "source": [
    "compress(\"\"\"\n",
    "I wish that I knew what I know now\n",
    "When I was younger.\n",
    "I wish that I knew what I know now\n",
    "When I was stronger.\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
